import pandas as pd
import numpy as np
import plotly as py
import plotly.graph_objects as go
import plotly.offline as offline_py
from colour import Color
from plotly.subplots import make_subplots
from scipy.stats import kstest
######
def _load_ticker(ticker):
    """Read one ticker's price-history CSV and tag every row with the ticker.

    The file name is the ticker with ``.`` replaced by ``_``
    (e.g. ``CMIG4.SA`` -> ``./energy_companies_stocks/CMIG4_SA.csv``).
    """
    csv_path = './energy_companies_stocks/{}.csv'.format(ticker.replace('.', '_'))
    frame = pd.read_csv(csv_path)
    frame['ticker'] = ticker
    return frame


# Every ticker is labelled in the same dotted form ("XXXX.SA") so that the
# pivoted column names are consistent across all five companies.
df_CPFE3_SA = _load_ticker('CPFE3.SA')
df_CPLE6_SA = _load_ticker('CPLE6.SA')
df_ELET3_SA = _load_ticker('ELET3.SA')
df_CMIG4 = _load_ticker('CMIG4.SA')
df_ENEV3 = _load_ticker('ENEV3.SA')

# Long-format table: one row per (date, ticker).
df = pd.concat([df_CMIG4, df_CPFE3_SA, df_CPLE6_SA, df_ELET3_SA, df_ENEV3])
df.tail()
| Date | Open | High | Low | Close | Adj Close | Volume | ticker | |
|---|---|---|---|---|---|---|---|---|
| 2128 | 2021-07-29 | 16.790001 | 16.900000 | 16.510000 | 16.850000 | 16.850000 | 3703400.0 | ENEV3.SA |
| 2129 | 2021-07-30 | 16.660000 | 16.860001 | 16.270000 | 16.459999 | 16.459999 | 5206300.0 | ENEV3.SA |
| 2130 | 2021-08-02 | 16.680000 | 17.190001 | 16.459999 | 16.940001 | 16.940001 | 8433500.0 | ENEV3.SA |
| 2131 | 2021-08-03 | 16.940001 | 16.950001 | 16.520000 | 16.799999 | 16.799999 | 3835100.0 | ENEV3.SA |
| 2132 | 2021-08-04 | 16.809999 | 17.030001 | 16.520000 | 16.969999 | 16.969999 | 3710100.0 | ENEV3.SA |
def _pivot_by_ticker(values):
    """Return a Date x ticker table holding the ``values`` column of ``df``."""
    return df.reset_index().pivot(index='Date', columns='ticker', values=values)


# One wide table per price field, indexed by date with one column per ticker.
adj_close = _pivot_by_ticker('Adj Close')
close = _pivot_by_ticker('Close')
high = _pivot_by_ticker('High')
low = _pivot_by_ticker('Low')
close.tail()
| ticker | CMIG4.SA | CPFE3_SA | CPLE6.SA | ELET3.SA | ENEV3.SA |
|---|---|---|---|---|---|
| Date | |||||
| 2021-07-29 | 12.10 | 25.850000 | 6.21 | 42.290001 | 16.850000 |
| 2021-07-30 | 11.94 | 25.320000 | 6.12 | 40.250000 | 16.459999 |
| 2021-08-02 | 11.87 | 25.420000 | 6.35 | 40.310001 | 16.940001 |
| 2021-08-03 | 11.95 | 25.530001 | 6.30 | 40.529999 | 16.799999 |
| 2021-08-04 | 11.80 | 25.639999 | 6.34 | 40.459999 | 16.969999 |
low.tail()
| ticker | CMIG4.SA | CPFE3_SA | CPLE6.SA | ELET3.SA | ENEV3.SA |
|---|---|---|---|---|---|
| Date | |||||
| 2021-07-29 | 12.05 | 25.730000 | 6.11 | 41.950001 | 16.510000 |
| 2021-07-30 | 11.88 | 25.280001 | 6.10 | 39.619999 | 16.270000 |
| 2021-08-02 | 11.86 | 25.420000 | 6.15 | 40.310001 | 16.459999 |
| 2021-08-03 | 11.67 | 25.120001 | 6.21 | 39.459999 | 16.520000 |
| 2021-08-04 | 11.73 | 25.320000 | 6.30 | 39.779999 | 16.520000 |
Let's see what a single stock looks like from the closing prices. For this example and future display examples in this project, we'll use ENEVA which is a power generation company based in Rio de Janeiro, Brazil.
If we tried to graph all the stocks, it would be too much information.
# Single-stock view: ENEVA adjusted close over the whole history.
_eneva_line = dict(color='firebrick', width=1)
_eneva_trace = go.Scatter(x=close.index, y=adj_close['ENEV3.SA'], name='ENEV3.SA',
                          line=_eneva_line)
fig = go.Figure()
fig.add_trace(_eneva_trace)
fig.update_layout(
    title='Stock Price - ENEVA - Adjusted Close',
    xaxis_title='Date',
    yaxis_title='Stock Price')
fig.show()
In this project you will code and evaluate a "breakout" signal. It is important to understand where these steps fit in the alpha research workflow. The signal-to-noise ratio in trading signals is very low and, as such, it is very easy to fall into the trap of overfitting to noise. It is therefore inadvisable to jump right into signal coding. To help mitigate overfitting, it is best to start with a general observation and hypothesis; i.e., you should be able to answer the following question before you touch any data:
What feature of markets or investor behaviour would lead to a persistent anomaly that my signal will try to use?
Ideally the assumptions behind the hypothesis will be testable before you actually code and evaluate the signal itself. The workflow therefore is as follows:

In this project, we assume that the first three steps are done ("observe & research", "form hypothesis", "validate hypothesis"). The hypothesis you'll be using for this project is the following:
Using this hypothesis, let's start coding.
You'll use the price highs and lows as an indicator for the breakout strategy. In this section, implement get_high_lows_lookback to get the maximum high price and minimum low price over a window of days. The variable lookback_days contains the number of days to look in the past. Make sure this doesn't include the current day.
def get_high_lows_lookback(high, low, lookback_days):
    """
    Compute the rolling extreme prices over the days preceding each date.

    Parameters
    ----------
    high : DataFrame
        High price for each ticker and date
    low : DataFrame
        Low price for each ticker and date
    lookback_days : int
        Size of the lookback window, in rows

    Returns
    -------
    lookback_high : DataFrame
        Maximum high over the previous `lookback_days` rows, per ticker and date
    lookback_low : DataFrame
        Minimum low over the previous `lookback_days` rows, per ticker and date
    """
    rolling_max = high.rolling(window=lookback_days).max()
    rolling_min = low.rolling(window=lookback_days).min()
    # Shifting by one row pushes the window back so the current day is excluded.
    return rolling_max.shift(1), rolling_min.shift(1)
# Two-day lookback; the frames are computed once and reused for display.
lookback_high, lookback_low = get_high_lows_lookback(high, low, 2)
print(lookback_high.shape)
lookback_high.tail()
(2133, 5)
| ticker | CMIG4.SA | CPFE3_SA | CPLE6.SA | ELET3.SA | ENEV3.SA |
|---|---|---|---|---|---|
| Date | |||||
| 2021-07-29 | 12.38 | 26.629999 | 6.18 | 43.040001 | 16.830000 |
| 2021-07-30 | 12.46 | 26.629999 | 6.23 | 43.049999 | 16.900000 |
| 2021-08-02 | 12.46 | 26.580000 | 6.23 | 43.049999 | 16.900000 |
| 2021-08-03 | 12.15 | 25.840000 | 6.37 | 42.290001 | 17.190001 |
| 2021-08-04 | 12.15 | 25.840000 | 6.37 | 41.349998 | 17.190001 |
get_high_lows_lookback(high, low, 2)[1].tail()
| ticker | CMIG4.SA | CPFE3_SA | CPLE6.SA | ELET3.SA | ENEV3.SA |
|---|---|---|---|---|---|
| Date | |||||
| 2021-07-29 | 12.00 | 25.910000 | 6.00 | 41.610001 | 16.230000 |
| 2021-07-30 | 12.05 | 25.730000 | 6.09 | 41.950001 | 16.500000 |
| 2021-08-02 | 11.88 | 25.280001 | 6.10 | 39.619999 | 16.270000 |
| 2021-08-03 | 11.86 | 25.280001 | 6.10 | 39.619999 | 16.270000 |
| 2021-08-04 | 11.67 | 25.120001 | 6.15 | 39.459999 | 16.459999 |
For the sake of comparison, let's use your implementation of get_high_lows_lookback to get the highs and lows for the past 50 days and compare them to their respective stock. This time we'll use CEMIG (CMIG4.SA) as the example to look at.
def plot_stock(ticker, high, low, close, adj_close, lookback_days):
    """
    Plot Close, Adjusted Close and the lookback High/Low bands for one ticker.

    Note: the four price DataFrames are forward-filled IN PLACE, so the
    caller's objects are modified by this call.

    Parameters
    ----------
    ticker : string
        Ticker ID (a column name of the price DataFrames)
    high : DataFrame
        High price for each ticker and date
    low : DataFrame
        Low price for each ticker and date
    close : DataFrame
        Close price for each ticker and date
    adj_close : DataFrame
        Adjusted Close price for each ticker and date
    lookback_days : int
        Window size passed to get_high_lows_lookback

    Returns
    -------
    None
        The chart is displayed via ``fig.show()``.
    """
    # Fill gaps with the last known price before computing the bands.
    for prices in (high, low, close, adj_close):
        prices.ffill(inplace=True)

    lookback_high, lookback_low = get_high_lows_lookback(high, low, lookback_days)

    traces = (
        (adj_close, 'Adjusted Close', 'grey'),
        (close, 'Close', 'black'),
        (lookback_high, 'Lookback HIGH', 'blue'),
        (lookback_low, 'Lookback LOW', 'firebrick'),
    )
    fig = go.Figure()
    for frame, label, color in traces:
        fig.add_trace(go.Scatter(x=frame.index, y=frame[ticker], name=label,
                                 line=dict(color=color, width=1)))
    # The title names the ticker actually plotted rather than a fixed one.
    fig.update_layout(title='High, Low, Close and Adjusted Close for {} Stock'.format(ticker),
                      xaxis_title='Date',
                      yaxis_title='Stock Price')
    fig.show()
plot_stock('CMIG4.SA', high, low, close, adj_close, 50)
Using the generated indicator of highs and lows, create long and short signals using a breakout strategy. Implement get_long_short to generate the following signals:
| Signal | Condition |
|---|---|
| -1 | Low > Close Price |
| 1 | High < Close Price |
| 0 | Otherwise |
In this chart, Close Price is the close parameter. Low and High are the values generated from get_high_lows_lookback, the lookback_high and lookback_low parameters.
def get_long_short(close, lookback_high, lookback_low):
    """
    Generate the signals long, short, and do nothing.

    A long signal (1) is emitted where the close is above the lookback high,
    a short signal (-1) where the close is below the lookback low, and 0
    everywhere else (including where any input is NaN).

    Parameters
    ----------
    close : DataFrame
        Close price for each ticker and date
    lookback_high : DataFrame
        Lookback high price for each ticker and date
    lookback_low : DataFrame
        Lookback low price for each ticker and date

    Returns
    -------
    long_short : DataFrame
        The long, short, and do nothing signals for each ticker and date,
        as int64
    """
    is_long = close.subtract(lookback_high) > 0
    is_short = close.subtract(lookback_low) < 0

    # Start from an all-zero frame rather than a copy of the prices: a close
    # of exactly 1.0 or -1.0 must not be mistaken for a signal.
    long_short = pd.DataFrame(0, index=close.index, columns=close.columns, dtype='int64')
    long_short[is_long] = 1
    long_short[is_short] = -1
    return long_short.astype('int64')
long_short = get_long_short(close, lookback_high, lookback_low)
long_short.tail(20)
| ticker | CMIG4.SA | CPFE3_SA | CPLE6.SA | ELET3.SA | ENEV3.SA |
|---|---|---|---|---|---|
| Date | |||||
| 2021-07-07 | 0 | 0 | 0 | 0 | 1 |
| 2021-07-08 | -1 | 0 | 0 | 0 | 0 |
| 2021-07-12 | 0 | 0 | 0 | 1 | 1 |
| 2021-07-13 | 0 | 0 | 0 | 0 | 1 |
| 2021-07-14 | 0 | 0 | 0 | 1 | 1 |
| 2021-07-15 | 0 | 0 | 0 | 0 | 0 |
| 2021-07-16 | -1 | 0 | 0 | 0 | -1 |
| 2021-07-19 | -1 | -1 | 0 | -1 | -1 |
| 2021-07-20 | 0 | 0 | 0 | 0 | 0 |
| 2021-07-21 | 1 | 0 | 0 | 0 | 0 |
| 2021-07-22 | 1 | 1 | 1 | 0 | 0 |
| 2021-07-23 | 0 | 0 | 0 | -1 | 0 |
| 2021-07-26 | 0 | -1 | 0 | 0 | -1 |
| 2021-07-27 | 0 | 0 | 0 | 0 | 0 |
| 2021-07-28 | 1 | 0 | 1 | 1 | 0 |
| 2021-07-29 | 0 | -1 | 1 | 0 | 1 |
| 2021-07-30 | -1 | -1 | 0 | -1 | -1 |
| 2021-08-02 | -1 | 0 | 1 | 0 | 1 |
| 2021-08-03 | 0 | 0 | 0 | 0 | 0 |
| 2021-08-04 | 0 | 0 | 0 | 0 | 0 |
# Shared palette for every chart in this notebook.
color_scheme = {
    # series colors
    'index': '#B6B2CF',
    'etf': '#2D3ECF',
    'tracking_error': '#6F91DE',
    # table colors
    'df_header': 'silver',
    'df_value': 'white',
    'df_line': 'silver',
    # heatmap and labels
    'heatmap_colorscale': [(0, '#6F91DE'), (0.5, 'grey'), (1, 'red')],
    'background_label': '#9dbdd5',
    # gradient endpoints
    'low_value': '#B6B2CF',
    'high_value': '#2D3ECF',
    # secondary axis and overlays
    'y_axis_2_text_color': 'grey',
    'shadow': 'rgba(0, 0, 0, 0.75)',
    # line colors
    'major_line': '#2D3ECF',
    'minor_line': '#B6B2CF',
    'main_line': 'black',
}
def generate_config():
    """Return a fresh plotly config dict shared by the offline charts."""
    config = dict(
        showLink=False,
        displayModeBar=False,
        showAxisRangeEntryBoxes=True,
    )
    return config
def _generate_stock_trace(prices):
    """Build the line trace (named 'Index') for a price Series."""
    line_style = {'color': color_scheme['main_line']}
    return go.Scatter(name='Index', x=prices.index, y=prices, line=line_style)
def _generate_buy_annotations(prices, signal):
    """
    Build one 'Long' chart annotation per date where ``signal`` equals 1.

    Parameters
    ----------
    prices : Pandas Series
        Prices; the index supplies the annotation x positions
    signal : Pandas Series
        Signals used to select entries of ``prices``

    Returns
    -------
    annotations : list of dict
        Plotly annotation dicts, one per long signal
    """
    long_prices = prices[signal == 1]
    # Series.iteritems() was removed in pandas 2.0; items() is the
    # equivalent that works on both old and new versions.
    return [
        {'x': index, 'y': price, 'text': 'Long', 'bgcolor': color_scheme['background_label'],
         'ayref': 'y', 'ax': 0, 'ay': 20}
        for index, price in long_prices.items()]
def _generate_sell_annotations(prices, signal):
    """
    Build one 'Short' chart annotation per date where ``signal`` equals -1.

    Parameters
    ----------
    prices : Pandas Series
        Prices; the index supplies the annotation x positions
    signal : Pandas Series
        Signals used to select entries of ``prices``

    Returns
    -------
    annotations : list of dict
        Plotly annotation dicts, one per short signal
    """
    short_prices = prices[signal == -1]
    # Series.iteritems() was removed in pandas 2.0; items() is the
    # equivalent that works on both old and new versions.
    return [
        {'x': index, 'y': price, 'text': 'Short', 'bgcolor': color_scheme['background_label'],
         'ayref': 'y', 'ax': 0, 'ay': 160}
        for index, price in short_prices.items()]
def plot_signal(price, signal, title):
    """Display a price line annotated with its long and short signals."""
    annotations = (_generate_buy_annotations(price, signal)
                   + _generate_sell_annotations(price, signal))
    figure = {
        'data': [_generate_stock_trace(price)],
        'layout': go.Layout(title=title, annotations=annotations),
    }
    offline_py.iplot(figure, config=generate_config())
signal = get_long_short(close, lookback_high, lookback_low)
plot_signal(
close['CMIG4.SA'],
signal['CMIG4.SA'],
'Long and Short of {} Stock'.format('CEMIG'))
That was a lot of repeated signals! If we're already shorting a stock, having an additional signal to short a stock isn't helpful for this strategy. This also applies to additional long signals when the last signal was long.
Implement filter_signals to filter out repeated long or short signals within the lookahead_days. If the previous signal was the same, change the signal to 0 (do nothing signal). For example, say you have a single stock time series that is
[1, 0, 1, 0, 1, 0, -1, -1]
Running filter_signals with a lookahead of 3 days should turn those signals into
[1, 0, 0, 0, 1, 0, -1, 0]
To help you implement the function, we have provided you with the clear_signals function. This will remove all signals within a window after the last signal. For example, say you're using a window size of 3 with clear_signals. It would turn the Series of long signals
[0, 1, 0, 0, 1, 1, 0, 1, 0]
into
[0, 1, 0, 0, 0, 1, 0, 0, 0]
clear_signals only takes a Series of the same type of signals, where 1 is the signal and 0 is no signal. It can't take a mix of long and short signals. Using this function, implement filter_signals.
For implementing filter_signals, we don't recommend trying to find a vectorized solution. Instead, you should iterate over each column and apply clear_signals to it.
def clear_signals(signals, window_size):
    """
    Thin out a Series of one kind of signal (only long or only short).

    A signal is kept only when no kept signal appears in the `window_size`
    positions before it; every other position becomes 0.

    Parameters
    ----------
    signals : Pandas Series
        The long, short, or do nothing signals
    window_size : int
        The number of days to have a single signal

    Returns
    -------
    signals : Pandas Series
        Integer signals on the original index, with repeats inside the
        window removed
    """
    # Leading zeros let the first real positions look back over a full window.
    kept = [0] * window_size

    for position, candidate in enumerate(signals):
        # Because of the padding, this slice is exactly the previous
        # `window_size` kept values.
        recent = kept[position:position + window_size]
        if sum(recent):
            kept.append(False)
        else:
            kept.append(candidate)

    # Drop the padding and return integers aligned with the input index.
    without_padding = kept[window_size:]
    return pd.Series(np.array(without_padding).astype(int), signals.index)
def filter_signals(signal, lookahead_days):
    """
    Filter out repeated signals in a DataFrame.

    Long (1) and short (-1) signals are thinned independently, column by
    column, with clear_signals, then recombined.

    Parameters
    ----------
    signal : DataFrame
        The long, short, and do nothing signals for each ticker and date
    lookahead_days : int
        The number of days to look ahead

    Returns
    -------
    filtered_signal : DataFrame
        The filtered long, short, and do nothing signals for each ticker and
        date, with the same column labels and column axis name as `signal`
    """
    # clear_signals handles one kind of signal at a time, so split them.
    long_signal = signal[signal == 1].fillna(0)
    short_signal = signal[signal == -1].fillna(0)

    filtered = {
        col: (clear_signals(short_signal[col], lookahead_days)
              + clear_signals(long_signal[col], lookahead_days))
        for col in signal.columns
    }
    # Passing the original columns keeps their order, and rename_axis keeps
    # the axis name (e.g. 'ticker') that column-by-column assignment drops.
    filtered_signal = pd.DataFrame(filtered, columns=signal.columns)
    return filtered_signal.rename_axis(columns=signal.columns.name)
signal_3 = filter_signals(signal, 3)
signal_5 = filter_signals(signal, 5)
signal_10 = filter_signals(signal, 10)
signal_20 = filter_signals(signal, 20)
signal_20.tail(15)
| CMIG4.SA | CPFE3_SA | CPLE6.SA | ELET3.SA | ENEV3.SA | |
|---|---|---|---|---|---|
| Date | |||||
| 2021-07-15 | 0 | 0 | 0 | 0 | 0 |
| 2021-07-16 | -1 | 0 | 0 | 0 | -1 |
| 2021-07-19 | 0 | 0 | 0 | 0 | 0 |
| 2021-07-20 | 0 | 0 | 0 | 0 | 0 |
| 2021-07-21 | 1 | 0 | 0 | 0 | 0 |
| 2021-07-22 | 0 | 0 | 0 | 0 | 0 |
| 2021-07-23 | 0 | 0 | 0 | 0 | 0 |
| 2021-07-26 | 0 | -1 | 0 | 0 | 0 |
| 2021-07-27 | 0 | 0 | 0 | 0 | 0 |
| 2021-07-28 | 0 | 0 | 0 | 0 | 0 |
| 2021-07-29 | 0 | 0 | 0 | 0 | 1 |
| 2021-07-30 | 0 | 0 | 0 | -1 | 0 |
| 2021-08-02 | 0 | 0 | 0 | 0 | 0 |
| 2021-08-03 | 0 | 0 | 0 | 0 | 0 |
| 2021-08-04 | 0 | 0 | 0 | 0 | 0 |
In order to compare both results
long_short.tail(15)
| ticker | CMIG4.SA | CPFE3_SA | CPLE6.SA | ELET3.SA | ENEV3.SA |
|---|---|---|---|---|---|
| Date | |||||
| 2021-07-15 | 0 | 0 | 0 | 0 | 0 |
| 2021-07-16 | -1 | 0 | 0 | 0 | -1 |
| 2021-07-19 | -1 | -1 | 0 | -1 | -1 |
| 2021-07-20 | 0 | 0 | 0 | 0 | 0 |
| 2021-07-21 | 1 | 0 | 0 | 0 | 0 |
| 2021-07-22 | 1 | 1 | 1 | 0 | 0 |
| 2021-07-23 | 0 | 0 | 0 | -1 | 0 |
| 2021-07-26 | 0 | -1 | 0 | 0 | -1 |
| 2021-07-27 | 0 | 0 | 0 | 0 | 0 |
| 2021-07-28 | 1 | 0 | 1 | 1 | 0 |
| 2021-07-29 | 0 | -1 | 1 | 0 | 1 |
| 2021-07-30 | -1 | -1 | 0 | -1 | -1 |
| 2021-08-02 | -1 | 0 | 1 | 0 | 1 |
| 2021-08-03 | 0 | 0 | 0 | 0 | 0 |
| 2021-08-04 | 0 | 0 | 0 | 0 | 0 |
# One chart per filtering window, all for the same ticker.
for signal_data, signal_days in zip((signal_5, signal_10, signal_20), (5, 10, 20)):
    chart_title = 'Long and Short of {} Stock with {} day signal window'.format('CMIG4.SA', signal_days)
    plot_signal(close['CMIG4.SA'], signal_data['CMIG4.SA'], chart_title)
With the trading signal done, we can start working on evaluating how many days to short or long the stocks. In this problem, implement get_lookahead_prices to get the close price days ahead in time. You can get the number of days from the variable lookahead_days. We'll use the lookahead prices to calculate future returns in another problem.
def get_lookahead_prices(close, lookahead_days):
    """
    Look up, for every date, the close price `lookahead_days` rows later.

    Parameters
    ----------
    close : DataFrame
        Close price for each ticker and date
    lookahead_days : int
        The number of days to look ahead

    Returns
    -------
    lookahead_prices : DataFrame
        The lookahead prices for each ticker and date; the last
        `lookahead_days` rows have no future price and are NaN
    """
    # A negative shift moves future rows up onto the current date.
    lookahead_prices = close.shift(periods=-lookahead_days)
    return lookahead_prices
get_lookahead_prices(close, lookahead_days=3)
| ticker | CMIG4.SA | CPFE3_SA | CPLE6.SA | ELET3.SA | ENEV3.SA |
|---|---|---|---|---|---|
| Date | |||||
| 2013-01-02 | 11.985459 | 19.464521 | 2.987 | 6.987344 | 153.900360 |
| 2013-01-03 | 11.657372 | 19.048573 | 2.898 | 6.397568 | 141.922592 |
| 2013-01-04 | 12.307830 | 19.473974 | 2.980 | 6.577500 | 145.683044 |
| 2013-01-07 | 12.517143 | 19.653589 | 3.040 | 6.597492 | 141.365479 |
| 2013-01-08 | 12.573672 | 19.587416 | 3.070 | 6.507527 | 141.086929 |
| ... | ... | ... | ... | ... | ... |
| 2021-07-29 | 11.950000 | 25.530001 | 6.300 | 40.529999 | 16.799999 |
| 2021-07-30 | 11.800000 | 25.639999 | 6.340 | 40.459999 | 16.969999 |
| 2021-08-02 | NaN | NaN | NaN | NaN | NaN |
| 2021-08-03 | NaN | NaN | NaN | NaN | NaN |
| 2021-08-04 | NaN | NaN | NaN | NaN | NaN |
2133 rows × 5 columns
lookahead_5 = get_lookahead_prices(close, 5)
lookahead_10 = get_lookahead_prices(close, 10)
lookahead_20 = get_lookahead_prices(close, 20)
lookahead_5.tail(10)
| ticker | CMIG4.SA | CPFE3_SA | CPLE6.SA | ELET3.SA | ENEV3.SA |
|---|---|---|---|---|---|
| Date | |||||
| 2021-07-22 | 12.10 | 25.850000 | 6.21 | 42.290001 | 16.850000 |
| 2021-07-23 | 11.94 | 25.320000 | 6.12 | 40.250000 | 16.459999 |
| 2021-07-26 | 11.87 | 25.420000 | 6.35 | 40.310001 | 16.940001 |
| 2021-07-27 | 11.95 | 25.530001 | 6.30 | 40.529999 | 16.799999 |
| 2021-07-28 | 11.80 | 25.639999 | 6.34 | 40.459999 | 16.969999 |
| 2021-07-29 | NaN | NaN | NaN | NaN | NaN |
| 2021-07-30 | NaN | NaN | NaN | NaN | NaN |
| 2021-08-02 | NaN | NaN | NaN | NaN | NaN |
| 2021-08-03 | NaN | NaN | NaN | NaN | NaN |
| 2021-08-04 | NaN | NaN | NaN | NaN | NaN |
Using the get_lookahead_prices function, let's generate lookahead closing prices for 5, 10, and 20 days.
Let's also chart the CMIG4.SA stock. Zooming the chart in to a few months instead of years lets you see the differences between the 5, 10, and 20 day lookaheads; otherwise, they mesh together when the chart is zoomed out.
def plot_lookahead_prices(ticker, close, lookahead_x1, lookahead_x2, lookahead_x3):
    """
    Plot the close price and three lookahead price series for one ticker.

    Parameters
    ----------
    ticker : String
        Ticker to plot (a column name of every DataFrame argument)
    close : DataFrame
        Close price for each ticker and date
    lookahead_x1 : DataFrame
        Lookahead prices, drawn under the legend name 'lookahead_5'
    lookahead_x2 : DataFrame
        Lookahead prices, drawn under the legend name 'lookahead_10'
    lookahead_x3 : DataFrame
        Lookahead prices, drawn under the legend name 'lookahead_20'

    Returns
    -------
    None
        The chart is displayed via ``fig.show()``.
    """
    series_specs = [
        (close, 'close', 'green'),
        (lookahead_x1, 'lookahead_5', 'red'),
        (lookahead_x2, 'lookahead_10', 'black'),
        (lookahead_x3, 'lookahead_20', 'blue'),
    ]
    fig = go.Figure()
    for frame, legend_name, color in series_specs:
        fig.add_trace(go.Scatter(x=frame.index, y=frame[ticker], name=legend_name,
                                 line=dict(color=color, width=1)))
    fig.update_layout(
        title='5, 10, and 20 day Lookahead Prices for Slice {} Stock'.format(ticker),
        xaxis_title='Date',
        yaxis_title='Stock Price')
    fig.show()
plot_lookahead_prices('CMIG4.SA', close, lookahead_5, lookahead_10, lookahead_20)
Implement get_return_lookahead to generate the log price return between the closing price and the lookahead price.
def get_return_lookahead(close, lookahead_prices):
    """
    Compute the log return from each date's close to its lookahead price.

    Parameters
    ----------
    close : DataFrame
        Close price for each ticker and date
    lookahead_prices : DataFrame
        The lookahead prices for each ticker and date

    Returns
    -------
    lookahead_returns : DataFrame
        The lookahead log returns for each ticker and date
    """
    log_future = np.log(lookahead_prices)
    log_current = np.log(close)
    # Difference of logs is the log of the price ratio.
    return log_future - log_current
get_return_lookahead(close, lookahead_prices=lookahead_5)
| ticker | CMIG4.SA | CPFE3_SA | CPLE6.SA | ELET3.SA | ENEV3.SA |
|---|---|---|---|---|---|
| Date | |||||
| 2013-01-02 | -0.055422 | -0.072998 | -0.061808 | -0.022540 | -0.079013 |
| 2013-01-03 | -0.039863 | -0.054295 | -0.031728 | -0.055979 | -0.101115 |
| 2013-01-04 | 0.014953 | -0.026198 | -0.009725 | -0.081106 | -0.118989 |
| 2013-01-07 | 0.046560 | 0.015900 | 0.028059 | -0.065016 | -0.061587 |
| 2013-01-08 | 0.072516 | 0.029342 | 0.068671 | 0.032286 | 0.027107 |
| ... | ... | ... | ... | ... | ... |
| 2021-07-29 | NaN | NaN | NaN | NaN | NaN |
| 2021-07-30 | NaN | NaN | NaN | NaN | NaN |
| 2021-08-02 | NaN | NaN | NaN | NaN | NaN |
| 2021-08-03 | NaN | NaN | NaN | NaN | NaN |
| 2021-08-04 | NaN | NaN | NaN | NaN | NaN |
2133 rows × 5 columns
Using the same lookahead prices for the CMIG4.SA stock from the previous problem, we'll view the lookahead returns.
In order to view price returns on the same chart as the stock, a second y-axis will be added. When viewing this chart, the axis for the price of the stock will be on the left side, like previous charts. The axis for price returns will be located on the right side.
price_return_5 = get_return_lookahead(close, lookahead_5)
price_return_10 = get_return_lookahead(close, lookahead_10)
price_return_20 = get_return_lookahead(close, lookahead_20)
######
def plot_price_return(ticker, price_return_x1, price_return_x2, price_return_x3):
    """
    Plot three lookahead return series for one ticker.

    Parameters
    ----------
    ticker : String
        Ticker to plot (a column name of every DataFrame argument)
    price_return_x1 : DataFrame
        Lookahead returns, drawn under the legend name 'price_return_5'
    price_return_x2 : DataFrame
        Lookahead returns, drawn under the legend name 'price_return_10'
    price_return_x3 : DataFrame
        Lookahead returns, drawn under the legend name 'price_return_20'

    Returns
    -------
    None
        The chart is displayed via ``fig.show()``.
    """
    series_specs = [
        (price_return_x1, 'price_return_5', 'grey'),
        (price_return_x2, 'price_return_10', 'black'),
        (price_return_x3, 'price_return_20', 'blue'),
    ]
    fig = go.Figure()
    for frame, legend_name, color in series_specs:
        fig.add_trace(go.Scatter(x=frame.index, y=frame[ticker], name=legend_name,
                                 line=dict(color=color, width=1)))
    # The plotted values are returns, so the y-axis is labelled as such
    # rather than as a stock price.
    fig.update_layout(
        title='5, 10, and 20 day Lookahead Returns for Slice {} Stock'.format(ticker),
        xaxis_title='Date',
        yaxis_title='Log Return')
    fig.show()
plot_price_return('CMIG4.SA', price_return_5, price_return_10, price_return_20)
Using the price returns generate the signal returns.
def get_signal_return(signal, lookahead_returns):
    """
    Weight the lookahead returns by the trading signal.

    Parameters
    ----------
    signal : DataFrame
        The long, short, and do nothing signals for each ticker and date
    lookahead_returns : DataFrame
        The lookahead log returns for each ticker and date

    Returns
    -------
    signal_return : DataFrame
        Signal returns for each ticker and date
    """
    # Element-wise product: a short (-1) flips the sign, a 0 zeroes the return.
    signal_return = signal * lookahead_returns
    return signal_return
Let's continue using the previous lookahead prices to view the signal returns. Just like before, the axis for the signal returns is on the right side of the chart.
def plot_signal_returns(prices, signal_return_list, titles):
    """
    Display one chart per lookahead window: price line plus signal returns.

    Parameters
    ----------
    prices : Pandas Series
        Price series drawn on the primary y-axis
    signal_return_list : list of (Series, Series, int)
        Tuples of (signal returns, signals, lookahead days), one per chart
    titles : list of str
        Chart titles, paired positionally with `signal_return_list`
    """
    config = generate_config()
    axis_text = {'color': color_scheme['y_axis_2_text_color']}
    # Signal returns are drawn on a secondary y-axis on the right-hand side.
    layout = go.Layout(
        yaxis2={
            'title': 'Signal Returns',
            'titlefont': axis_text,
            'tickfont': axis_text,
            'overlaying': 'y',
            'side': 'right'})
    gradient = Color(color_scheme['low_value']).range_to(
        Color(color_scheme['high_value']), len(signal_return_list))
    stock_trace = _generate_stock_trace(prices)

    for (signal_return, signal, lookahead_days), color, title in zip(signal_return_list, gradient, titles):
        # Only dates that carry a non-zero signal return are drawn.
        traded = signal_return[signal_return != 0]
        signal_return_trace = go.Scatter(
            x=traded.index,
            y=traded,
            name='{} Day Lookahead'.format(lookahead_days),
            line={'color': str(color)},
            yaxis='y2')

        # The same layout object is reused; title and annotations are
        # replaced for each chart.
        layout['title'] = title
        layout['annotations'] = (_generate_buy_annotations(prices, signal)
                                 + _generate_sell_annotations(prices, signal))

        offline_py.iplot({'data': [stock_trace, signal_return_trace], 'layout': layout}, config=config)
###########
# Chart title template: lookahead window, then ticker.
title_string = '{} day Lookahead Signal Returns for {} Stock'

# Signal returns for each filtering window and its matching lookahead.
signal_return_5 = get_signal_return(signal_5, price_return_5)
signal_return_10 = get_signal_return(signal_10, price_return_10)
signal_return_20 = get_signal_return(signal_20, price_return_20)

plot_signal_returns(
    close['CMIG4.SA'],
    [
        (returns['CMIG4.SA'], signals['CMIG4.SA'], days)
        for returns, signals, days in (
            (signal_return_5, signal_5, 5),
            (signal_return_10, signal_10, 10),
            (signal_return_20, signal_20, 20))],
    [title_string.format(days, 'CMIG4.SA') for days in (5, 10, 20)])
signal_return_5.head()
| CMIG4.SA | CPFE3_SA | CPLE6.SA | ELET3.SA | ENEV3.SA | |
|---|---|---|---|---|---|
| Date | |||||
| 2013-01-02 | -0.000000 | -0.0000 | -0.000000 | -0.000000 | -0.000000 |
| 2013-01-03 | -0.000000 | -0.0000 | -0.000000 | -0.000000 | -0.000000 |
| 2013-01-04 | -0.014953 | -0.0000 | 0.009725 | -0.081106 | -0.000000 |
| 2013-01-07 | 0.000000 | -0.0159 | 0.000000 | -0.000000 | -0.000000 |
| 2013-01-08 | 0.000000 | 0.0000 | 0.000000 | -0.032286 | -0.027107 |
def plot_signal_histograms(signal_list, title, subplot_titles):
    """
    Display side-by-side histograms of the non-zero values of each DataFrame.

    Parameters
    ----------
    signal_list : list of DataFrame
        One DataFrame per subplot
    title : str
        Figure title
    subplot_titles : sequence of str
        One title per subplot; must match `signal_list` in length
    """
    assert len(signal_list) == len(subplot_titles)

    stacked_list = [frame.stack() for frame in signal_list]
    pooled = pd.concat(stacked_list)
    # Every subplot shares the same axis ranges so the panels are comparable.
    shared_x = [pooled.min(), pooled.max()]
    shared_y = [0, 1500]
    gradient = Color(color_scheme['low_value']).range_to(
        Color(color_scheme['high_value']), len(stacked_list))

    fig = py.subplots.make_subplots(
        rows=1, cols=len(stacked_list), subplot_titles=subplot_titles, print_grid=False)
    fig['layout'].update(title=title, showlegend=False)

    for column, (stacked, shade) in enumerate(zip(stacked_list, gradient), 1):
        non_zero = stacked[stacked != 0].dropna()
        fig.append_trace(go.Histogram(x=non_zero, marker={'color': str(shade)}), 1, column)
        fig['layout']['xaxis{}'.format(column)].update(range=shared_x)
        fig['layout']['yaxis{}'.format(column)].update(range=shared_y)

    offline_py.iplot(fig, config=generate_config())
########
plot_signal_histograms(
[signal_return_5, signal_return_10, signal_return_20],
'Signal Return',
('5 Days', '10 Days', '20 Days'))
You might have noticed the outliers in the 10 and 20 day histograms. To better visualize the outliers, let's compare the 5, 10, and 20 day signals returns to normal distributions with the same mean and deviation for each signal return distributions.
def plot_signal_to_normal_histograms(signal_list, title, subplot_titles):
    """
    Display each DataFrame's non-zero values next to a matching normal sample.

    For every subplot a random normal sample is drawn with the same mean,
    standard deviation and size as the non-zero values being plotted.

    Parameters
    ----------
    signal_list : list of DataFrame
        One DataFrame per subplot
    title : str
        Figure title
    subplot_titles : sequence of str
        One title per subplot; must match `signal_list` in length
    """
    assert len(signal_list) == len(subplot_titles)

    stacked_list = [frame.stack() for frame in signal_list]
    pooled = pd.concat(stacked_list)
    # Every subplot shares the same axis ranges so the panels are comparable.
    shared_x = [pooled.min(), pooled.max()]
    shared_y = [0, 1500]

    fig = py.subplots.make_subplots(
        rows=1, cols=len(stacked_list), subplot_titles=subplot_titles, print_grid=False)
    fig['layout'].update(title=title)

    for column, stacked in enumerate(stacked_list, 1):
        non_zero = stacked[stacked != 0].dropna()
        observed_trace = go.Histogram(
            x=non_zero,
            marker={'color': color_scheme['low_value']},
            name='Signal Return Distribution',
            showlegend=False)
        reference_sample = np.random.normal(np.mean(non_zero), np.std(non_zero), len(non_zero))
        reference_trace = go.Histogram(
            x=reference_sample,
            marker={'color': color_scheme['shadow']},
            name='Normal Distribution',
            showlegend=False)
        fig.append_trace(observed_trace, 1, column)
        fig.append_trace(reference_trace, 1, column)
        fig['layout']['xaxis{}'.format(column)].update(range=shared_x)
        fig['layout']['yaxis{}'.format(column)].update(range=shared_y)

    # Show the legend once, using the first pair of traces.
    for trace_position in (0, 1):
        fig['data'][trace_position]['showlegend'] = True

    offline_py.iplot(fig, config=generate_config())
#########
plot_signal_to_normal_histograms(
[signal_return_5, signal_return_10, signal_return_20],
'Signal Return',
('5 Days', '10 Days', '20 Days'))
While you can see the outliers in the histogram, we need to find the stocks that are causing these outlying returns. We'll use the Kolmogorov-Smirnov Test or KS-Test. This test will be applied to each ticker's signal returns where a long or short signal exists.
def _ticker_signal_returns(signal_return, signal):
    """Return a two-column (ticker, signal_return) table of traded returns."""
    # Keep only returns that have a long or short signal, then go long-form.
    stacked = signal_return[signal != 0].stack()
    # After reset_index the columns are date, ticker, value; drop the date.
    table = stacked.reset_index().iloc[:, [1, 2]]
    table.columns = ['ticker', 'signal_return']
    return table


long_short_signal_returns_5 = _ticker_signal_returns(signal_return_5, signal_5)
long_short_signal_returns_10 = _ticker_signal_returns(signal_return_10, signal_10)
long_short_signal_returns_20 = _ticker_signal_returns(signal_return_20, signal_20)

# View some of the data
long_short_signal_returns_5.head(10)
| ticker | signal_return | |
|---|---|---|
| 0 | CMIG4.SA | -0.014953 |
| 1 | CPLE6.SA | 0.009725 |
| 2 | ELET3.SA | -0.081106 |
| 3 | CPFE3_SA | -0.015900 |
| 4 | ELET3.SA | -0.032286 |
| 5 | ENEV3.SA | -0.027107 |
| 6 | CMIG4.SA | 0.038559 |
| 7 | CPFE3_SA | 0.028919 |
| 8 | CPLE6.SA | 0.039980 |
| 9 | ENEV3.SA | 0.077173 |
This gives you the data to use in the KS-Test.
Now it's time to implement the function calculate_kstest to run the Kolmogorov-Smirnov test (KS test) between a distribution of stock returns (the input dataframe in this case) and each stock's signal returns. Run the KS test on a normal distribution against each stock's signal returns. Use scipy.stats.kstest to perform the KS test. When calculating the standard deviation of the signal returns, make sure to set the delta degrees of freedom to 0.
For this function, we don't recommend trying to find a vectorized solution. Instead, you should iterate over the groupby function.
def calculate_kstest(long_short_signal_returns):
    """
    Run a KS test per ticker against one normal distribution.

    The reference normal distribution uses the mean and standard deviation
    of ALL signal returns in the input, not of each ticker's own returns.

    Parameters
    ----------
    long_short_signal_returns : DataFrame
        The signal returns which have a signal.
        This DataFrame contains two columns, "ticker" and "signal_return"

    Returns
    -------
    ks_values : Pandas Series
        KS statistic for all the tickers
    p_values : Pandas Series
        P value for all the tickers
    """
    all_returns = long_short_signal_returns['signal_return']
    # The reference parameters are the same for every ticker, so compute once.
    normal_args = (np.mean(all_returns), np.std(all_returns))

    tickers = []
    ks_stats = []
    p_vals = []
    for ticker, rows in long_short_signal_returns.groupby('ticker'):
        statistic, p_value = kstest(rvs=rows['signal_return'].values,
                                    cdf='norm',
                                    args=normal_args)
        tickers.append(ticker)
        ks_stats.append(statistic)
        p_vals.append(p_value)

    ks_values = pd.Series(data=ks_stats, index=tickers)
    p_values = pd.Series(data=p_vals, index=tickers)
    return ks_values, p_values
k_series, p_series = calculate_kstest(long_short_signal_returns=long_short_signal_returns_5)
k_series.head()
CMIG4.SA 0.079217 CPFE3_SA 0.180210 CPLE6.SA 0.102876 ELET3.SA 0.050170 ENEV3.SA 0.075240 dtype: float64
p_series.head()
CMIG4.SA 1.506033e-02 CPFE3_SA 5.095406e-10 CPLE6.SA 5.317106e-04 ELET3.SA 2.963644e-01 ENEV3.SA 5.125928e-02 dtype: float64
Using the signal returns we created above, let's calculate the ks and p values.
ks_values_5, p_values_5 = calculate_kstest(long_short_signal_returns_5)
ks_values_10, p_values_10 = calculate_kstest(long_short_signal_returns_10)
ks_values_20, p_values_20 = calculate_kstest(long_short_signal_returns_20)
print('ks_values_5')
print(ks_values_5.head(10))
print('p_values_5')
print(p_values_5.head(10))
ks_values_5 CMIG4.SA 0.079217 CPFE3_SA 0.180210 CPLE6.SA 0.102876 ELET3.SA 0.050170 ENEV3.SA 0.075240 dtype: float64 p_values_5 CMIG4.SA 1.506033e-02 CPFE3_SA 5.095406e-10 CPLE6.SA 5.317106e-04 ELET3.SA 2.963644e-01 ENEV3.SA 5.125928e-02 dtype: float64
for index, value in ks_values_5.items():
if value:
print(f"Index : {index}, Value : {value}")
Index : CMIG4.SA, Value : 0.07921675337431855 Index : CPFE3_SA, Value : 0.18021046208735403 Index : CPLE6.SA, Value : 0.10287568159141436 Index : ELET3.SA, Value : 0.05016996929779538 Index : ENEV3.SA, Value : 0.07523971503425697
for index, value in p_values_5.items():
if value:
print(f"Index : {index}, Value : {value}")
Index : CMIG4.SA, Value : 0.015060328270503603 Index : CPFE3_SA, Value : 5.095406324719296e-10 Index : CPLE6.SA, Value : 0.0005317105570371651 Index : ELET3.SA, Value : 0.2963643782764944 Index : ENEV3.SA, Value : 0.05125927651614515
With the ks and p values calculated, let's find which symbols are the outliers. Implement the find_outliers function to find the following outliers:
Symbols with a p-value less than pvalue_threshold AND with a KS value above ks_threshold. Note: your function should return symbols that meet both requirements above.
def find_outliers(ks_values, p_values, ks_threshold, pvalue_threshold=0.05):
    """
    Select the symbols that fail both the KS-statistic and the p-value check.

    Parameters
    ----------
    ks_values : Pandas Series
        KS statistic for all the tickers
    p_values : Pandas Series
        P value for all the tickers
    ks_threshold : float
        The threshold for the KS statistic
    pvalue_threshold : float
        The threshold for the p-value

    Returns
    -------
    outliers : set of str
        Symbols whose KS value is strictly above `ks_threshold` and whose
        p-value is strictly below `pvalue_threshold`
    """
    above_ks = {symbol for symbol, ks in ks_values.items() if ks > ks_threshold}
    below_p = {symbol for symbol, p in p_values.items() if p < pvalue_threshold}
    # A symbol is an outlier only when it meets both conditions.
    return below_p & above_ks
ks_threshold = 0.8
outliers_5 = find_outliers(ks_values_5, p_values_5, ks_threshold)
outliers_10 = find_outliers(ks_values_10, p_values_10, ks_threshold)
outliers_20 = find_outliers(ks_values_20, p_values_20, ks_threshold)

# A ticker counts as an outlier if it is flagged at any lookahead window.
outlier_tickers = outliers_5 | outliers_10 | outliers_20
print('{} Outliers Found:\n{}'.format(len(outlier_tickers), ', '.join(outlier_tickers)))
0 Outliers Found:
Let's compare the 5, 10, and 20 day signals returns without outliers to normal distributions. Also, let's see how the P-Value has changed with the outliers removed.
# Filter the columns in their original order. Going through a set
# difference would make the column order vary from run to run.
good_tickers = [ticker for ticker in close.columns if ticker not in outlier_tickers]
plot_signal_to_normal_histograms(
    [signal_return_5[good_tickers], signal_return_10[good_tickers], signal_return_20[good_tickers]],
    'Signal Return Without Outliers',
    ('5 Days', '10 Days', '20 Days'))